import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
file_one = pd.read_excel('1.xlsx', index_col=0)
file_two = pd.read_excel('2.xlsx', index_col=0)
merged_files = pd.merge(file_one,file_two, on=['Category', 'site'], suffixes=('_left', '_right'))
merged_files.replace('-' or '', np.nan, inplace=True)
merged_files['viewers_summarised'] = merged_files.iloc[:, 2:].apply(pd.to_numeric, errors='coerce').sum(axis=1)
merged_files['rank_most_viewed'] = merged_files['viewers_summarised'].rank(ascending=False).astype(int)
merged_files = merged_files.sort_values(by='rank_most_viewed', ascending=False)
cols_melt = [col for col in merged_files.columns if col not in ["site", "Category", "viewers_summarised", "rank_most_viewed"]]
df_over_time = merged_files.melt(
id_vars=["site", "Category"],
value_vars=cols_melt,
).sort_values(
by=["site", "Category", "variable"]
)
px.line(data_frame=df_over_time[df_over_time["Category"] == "Ranking ogólny"], x="variable", y="value", color="site")
category_grouped_with_value = df_over_time.groupby(["Category", "variable"])["value"].sum().reset_index()
fig = px.line(
category_grouped_with_value,
x="variable",
y="value",
color="Category",
title="Wykres liniowy wartości w czasie dla każdej unikalnej kategorii",
)
fig.show()
category_summarized = merged_files.groupby('Category')['viewers_summarised'].sum()
labels = category_summarized.index
values = category_summarized.values
categ_most_viewed = go.Figure(data=[go.Pie(labels=labels, values=values)])
categ_most_viewed.update_layout(title_text='Jakie kategorie serwisów są najcześciej wyświetlane przez użytkowników: ')
merged_files_view_sort = merged_files.sort_values(by='viewers_summarised', ascending=False)
merged_files_view_sort = merged_files_view_sort[merged_files_view_sort['site'] != 'All selected media channels']
top_20_viewers = merged_files_view_sort.head(20)
fig = go.Figure(data=[go.Bar(x=top_20_viewers['site'], y=top_20_viewers['viewers_summarised'])])
fig.update_layout(title_text='20 stron z największą liczbą realnych użytkowników ')
file = pd.read_excel('dane surowe.xlsx', index_col=0)
file['hit_timestamp'] = pd.to_datetime(file['hit_timestamp'], format='%d.%m.%Y %H:%M')
session_duration = file.groupby('session_id')['hit_timestamp'].max() - file.groupby('session_id')['hit_timestamp'].min()
file['session_duration'] = file['session_id'].map(session_duration)
bins = [0, 300, 600, 1200, float('inf')]
labels = ['Poniżej 5 minut', '5-10 minut', '10-20 minut', 'Powyżej 20 minut']
file['session_duration_category'] = pd.cut(file['session_duration'].dt.total_seconds(), bins=bins, labels=labels)
duration_counts = file['session_duration_category'].value_counts()
fig = px.pie(duration_counts, values=duration_counts.values, names=duration_counts.index,
labels={'label': 'Czas trwania sesji'}, title='Głębokość sesji',
hole=0.3, color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
session_source_counts = file['session_source_analytics'].value_counts()
fig = px.pie(session_source_counts, names=session_source_counts.index, values=session_source_counts.values,
title='Rozkład źródeł ruchu')
fig.show()
topic_counts = file['topic'].value_counts()
fig = px.pie(topic_counts, values=topic_counts.values, names=topic_counts.index, title='Najczęściej oglądane tematy: ')
fig.show()
unique_sessions = file.drop_duplicates(subset='session_id')
topic_counts = unique_sessions['topic'].value_counts().reset_index()
topic_counts.columns = ['topic', 'count']
fig = px.bar(topic_counts, x='topic', y='count', title='Najczęściej oglądane tematy: ')
fig.show()